# -*- coding: utf-8 -*-
import re
from common import get_from_yd
from common import sql_script


def get_cet4():
    """Parse ``cet4.txt`` (in the current directory) into a word dictionary.

    A line is accepted when it starts with a run of word characters, followed
    by exactly one whitespace character, a tag made of word characters ending
    in a dot (presumably a part-of-speech marker such as ``n.``) and at least
    one more character of text.  Inside that remaining text every further
    ``<whitespace><tag>.`` marker is replaced by a fullwidth semicolon, so all
    meanings of a word end up in one delimited string.

    Lines that do not match are skipped silently.

    Returns:
        dict: word -> meaning text.  If a word occurs on several lines the
        last occurrence wins.
    """
    entry_re = re.compile(r"^(\w+)\s\w+\.(.+)")
    # Compiled once here instead of being looked up for every matching line.
    tag_re = re.compile(r"\s\w+\.")
    allist = {}
    # ``with`` releases the file handle even if parsing raises part-way.
    with open("cet4.txt") as words:
        for line in words:
            m = entry_re.match(line)
            if m:
                allist[m.group(1)] = tag_re.sub('；', m.group(2))
    return allist

def list2string(re_list):
    """Join the first element of every item in *re_list* with ``";"``.

    Args:
        re_list: iterable of indexable items (for example the tuples returned
            by ``re.findall`` for a pattern with several groups).  Only
            ``item[0]`` of each entry is used and it must be a string.

    Returns:
        str: the first elements separated by ``";"``; an empty string when
        *re_list* is empty.

    Note:
        The separator is placed strictly between consecutive entries.  The
        earlier hand-written loop only added it once the accumulated result
        was non-empty, so leading empty entries lost their separator while
        empty entries in the middle kept theirs.
    """
    return ";".join(item[0] for item in re_list)

def get_cet6():
    """Parse ``CET6.txt`` (in the current directory) into a word dictionary.

    For each line the first run of word characters / parentheses that is
    followed by whitespace is taken as the key and the rest of the line as
    the raw meaning text.  From that text every ``<tag>.<meaning>`` pair is
    extracted, where the meaning is the shortest text up to one of the
    separator byte sequences, a whitespace character or the end of the text.
    The meanings are then combined with ``list2string``.

    Lines on which no key/meaning pair is found are printed (stripped) so
    they can be inspected; they are not added to the result.

    Returns:
        dict: word -> meanings combined by ``list2string``.  If a word occurs
        on several lines the last occurrence wins.
    """
    line_re = re.compile(r"([\w\(\)]+)\s+(.+)")
    # ``\xef\xbc\x9b`` / ``\xef\xbc\x8c`` are the UTF-8 byte sequences of the
    # fullwidth semicolon and fullwidth comma, so they only act as separators
    # when lines are byte strings (Python 2 ``str``).
    # NOTE(review): with decoded text (Python 3 text mode) these alternatives
    # would not match the decoded characters -- confirm before porting.
    # Compiled once here rather than once per input line.
    meaning_re = re.compile(r"\w+\.(.+?)(\xef\xbc\x9b|\xef\xbc\x8c|\s|$)")
    cet6 = {}
    # ``with`` closes the file; previously the handle was never closed.
    with open("CET6.txt") as f:
        for line in f:
            m = line_re.search(line)
            if m:
                cet6[m.group(1)] = list2string(meaning_re.findall(m.group(2)))
            else:
                # Single-argument call form prints the same under Python 2.
                print(line.strip())
    return cet6
def main():
    # Load both word lists as (word, meaning) pairs; CET-4 first, then CET-6,
    # so any unparsed CET-6 lines are reported after the CET-4 file is read.
    cet4_pairs = get_cet4().items()
    cet6_pairs = get_cet6().items()

    # Disabled steps, kept for reference:
    #   for i in cet6_pairs:
    #       get_from_yd.get_audio(i[0], "D:\Git\python-lite\cet6")
    #   sql_script.insert_list(cet4_pairs, "60.205.94.133", 46, "CET-6词汇(1)", 3117)

    # The meaning of the positional arguments is defined by
    # sql_script.insert_list, which is not visible from this file.
    sql_script.insert_list(cet6_pairs, "60.205.94.133", 47, "CET-6词汇(2)", 3440)

    # Report how many entries each list contains (CET-4 first, then CET-6).
    for pairs in (cet4_pairs, cet6_pairs):
        print(len(pairs))


# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
